
## Load packages ##
library(readxl)
library(haven)
library(tidyr)
library(dplyr)
library(ggplot2)
library(ggrepel)
library(countrycode)

## Important notes: ##

# We used 2021 values for 27 countries missing data in year of survey
# Zanzibar uses Tanzania income
# England uses United Kingdom income
# 5 countries (see below) have diverging indicator/data source
# The smallest countries have labels that are slightly larger than proportional

## Load data ##
data_hpacc <- read_excel("/[paste path]/Data.xlsx")
data_WB <- read_excel("[paste path]/WB_GNI_per cap_2021Int$.xls")

## Merge & clean data ##        
# Clean data: pre-merge
#WB
data_WB <-subset(data_WB, select = c("country", "country_ISO", "2009", "2010", "2011", "2012", "2013", "2014",
                                     "2015", "2016", "2017", "2018", "2019", "2020", "2021")) 
data_WB$country <- as.factor(data_WB$country)
data_WB$country <- as.factor(data_WB$country_ISO)
is.numeric(data_WB$"2009")

#HPACC: Add ISO codes
data_hpacc$country <- recode(data_hpacc$country, "England" = "United Kingdom", "STP" = "Sao Tome and Principe")
data_hpacc$country_ISO <- countrycode(data_hpacc$country, "country.name", "iso3c")   
data_hpacc$country_ISO[data_hpacc$country == "Zanzibar"] <- "TZA"       # Use Tanzania GNI for Zanzibar

# Reshape World Bank data to long format
data_WB_long <- data_WB %>%
  pivot_longer(
    cols = -c(country, country_ISO),     
    names_to = "year",        
    values_to = "income"   
  ) %>%
  mutate(year = as.integer(year)) 

# Merge data          // Use GNI in year of data collection; if not available, use 2021 data
data_merge <- data_hpacc %>%
  # First, attempt to join on the exact year
  left_join(data_WB_long, by = c("country_ISO", "year")) %>%
  # Add income for 2021 as a fallback
  left_join(data_WB_long %>% filter(year == 2021) %>% select(country_ISO, income_2021 = income), 
            by = "country_ISO") %>%
  # Use income from the respective year, or fallback to 2021 if NA
  mutate(income = ifelse(is.na(income), income_2021, income)) %>%
  select(-income_2021)

names(data_merge)[names(data_merge) == "country.x"] <- "country"

# Add estimates for 6 countries without data 
data_merge$income[data_merge$country == "Eritrea"] <- 1580           # 2010 (= survey year) GDP per capita in CURRENT Int$: https://data.worldbank.org/indicator/NY.GNP.PCAP.PP.CD?locations=ER
data_merge$income[data_merge$country == "Venezuela"] <- 16910         # 2011 (most recent, 4 years before survey) GDP per capita in CURRENT Int$: https://data.worldbank.org/indicator/NY.GNP.PCAP.PP.CD?locations=VE

data_merge$income[data_merge$country == "Tokelau"] <- 7445            # All pacific islands use data on !GDP per capita in !USD (year not specified) from: https://stats.pacificdata.org/?lc=en
data_merge$income[data_merge$country == "Wallis and Futuna"] <- 11674 
data_merge$income[data_merge$country == "Cook Islands"] <- 18806 
data_merge$income[data_merge$country == "Niue"] <- 19464 

# Clean data: post-merge
names(data_merge)[names(data_merge) == "country.x"] <- "country"
data_merge$income <- as.numeric(data_merge$income)
data_merge$income_log <- log(data_merge$income )
data_merge$worldregion7 <- as.factor(data_merge$worldregion7)

# Add labels
region_labels <- c(
  "1" = "Europe & North America",
  "2" = "Central Asia",
  "3" = "Middle East & North Africa",
  "4" = "Latin America & Caribbean",
  "5" = "Pacific island nations",
  "6" = "South, East & Southeast Asia",
  "7" = "Sub-Saharan Africa",
  "8" = "Global"
)

# Export to dta for Stata import 
stata_exp <- data_merge %>% select(country, income, income_log)
stata_exp$country <- recode(stata_exp$country, "United Kingdom" = "England", "Sao Tome and Principe" = "STP")
write_dta(stata_exp, "GNI_R_export.dta")

# Compute R-squared
model <- lm(glp1_elig_rel ~ income_log, data = data_merge)
r_squared <- summary(model)$r.squared

## Create plot ##           // Note: The smallest countries have labels that are slightly larger than proportional, so you still see them
bubble <- ggplot(data_merge, aes(x = income_log, y = glp1_elig_rel,  color = worldregion7, size = Pop_size_25to64)) +
  geom_smooth(method = "lm", se = FALSE, color = "grey40", size = "longdash", linewidth = 0.6, position = "identity") + 
  annotate("text", x = 7.3, y = 90, label = paste0("R² = ", round(r_squared, 3)), size = 4, hjust = 0) +
  geom_point(alpha = 0.5) +
  scale_color_hue(labels = region_labels) +
  geom_text_repel(aes(label = country, color = worldregion7), size = 3, show.legend = FALSE) +
  scale_size_continuous(name = "Population size (millions)", range = c(0.4, 25), breaks = c(50, 200, 400, 600, 800)) +
  labs(x = "Log per capita income (2021 International-$)", y = "GLP-1 receptor agonist eligibility (%)", color = "World region") +
  theme_classic() +
  theme(
    legend.position = "right",
    axis.title.y = element_text(color = "black", size = 11, face = "bold"),
    axis.text.y = element_text(color = "black", size = 11),
    axis.text.x = element_text(color = "black", size = 11),
    axis.title.x = element_text(color = "black", size = 11, face = "bold")) +
  guides(color = guide_legend(order = 1), size = guide_legend(order = 2)) +
  scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 10), expand = c(0, 0)) +
  scale_x_continuous(limits = c(6.95, 12), breaks = seq(7, 12, by = 1), expand = c(0.005, 0.02)) 

ggsave("Figure 3_bubble.pdf", plot = bubble, width = 14, height = 10)

